W ramach rego skryptu zastaną przetestowane możliwości algorytmu KNN w zakresie klasyfikacji i regresji. Celem będzie:
klasyfikacja: clicks_per_view = 0
vs. clicks_per_view > 0
regresja: przewidywanie wskaźnika
clicks_per_view
kombinacja obu metod: przewidywanie, czy
clicks_per_view > 0, a następnie regresja dokładnej
wartości wskaźnika
Ze względu na duży rozmiar danych (~30 000 wierszy) w ramach analizy
wykorzystany zostanie algorytm Fast KNN (fknn).
## title word_count log_word_count
## 0 mathematics 8055 8.994172
## 1 language of mathematics 797 6.682109
## 2 abacus 3991 8.292048
## 3 roman abacus 2018 7.610358
## 4 tlalcuahuitl 152 5.030438
## 5 babylonian mathematics 1975 7.588830
## image_titles
## 0 ['arithmetic symbols.svg', 'bakhshali numerals 2.jpg', 'carl friedrich gauss 1840 by jensen.jpg', 'cauchy sequence illustration.svg', 'commons-logo.svg', 'fieldsmedalfront.jpg', 'giant pufferfish skin pattern detail.jpg', 'godfreykneller-isaacnewton-1689.jpg', 'gottfried wilhelm leibniz, bernhard christoph francke.jpg', 'illustrationcentraltheorem.png']
## 1 ['question book-new.svg']
## 2 ['1543 robert recorde.png', 'abacus (psf).png', 'abacus 6.png', 'adam riesen.jpg', 'bbinary abacus 002.jpg', 'commons-logo.svg', 'houghton typ 520.03.736 - margarita philosophica.jpg', 'kugleramme.jpg', 'köbel böschenteyn 1514.jpg', 'nuvola apps edu mathematics blue-p.svg']
## 3 ['abacus usages.jpg', 'ambox important.svg', 'commons-logo.svg', 'nuvola apps edu mathematics blue-p.svg', 'people icon.svg', 'romanabacusrecon.jpg', 'roman tablet employed in making arithmetical calculations (14781129921).jpg', 'symbol category class.svg']
## 4 ['cemacolli one third tlalcuahuitl aztec glyph.png', 'cemmatl thee fifths of a tlalcuahuitl.png', 'cemmitl one half of tlalcuahuitl aztec glyph.png', 'cemomitl one fifth of tlalcuahutil aztec glyph.png', 'cenyollotli two fifths of a tlalcuahuitl aztec glyph.png']
## 5 ['asia (orthographic projection).svg', 'clay tablet, mathematical, geometric-algebraic, similar to the euclidean geometry. from tell harmal, iraq. 2003-1595 bce. iraq museum.jpg\\', "file:clay tablet, mathematical, geometric-algebraic, similar to the pythagorean theorem. from tell al-dhabba\\'i, iraq. 2003-1595 bce. iraq museum.jpg", \\'file:nuvola apps edu mathematics blue-p.svg', 'symbol category class.svg', 'ybc-7289-obv-labeled.jpg']
## image_titles_string
## 0 arithmetic symbols.svg, bakhshali numerals 2.jpg, carl friedrich gauss 1840 by jensen.jpg, cauchy sequence illustration.svg, commons-logo.svg, fieldsmedalfront.jpg, giant pufferfish skin pattern detail.jpg, godfreykneller-isaacnewton-1689.jpg, gottfried wilhelm leibniz, bernhard christoph francke.jpg, illustrationcentraltheorem.png
## 1 question book-new.svg
## 2 1543 robert recorde.png, abacus (psf).png, abacus 6.png, adam riesen.jpg, bbinary abacus 002.jpg, commons-logo.svg, houghton typ 520.03.736 - margarita philosophica.jpg, kugleramme.jpg, köbel böschenteyn 1514.jpg, nuvola apps edu mathematics blue-p.svg
## 3 abacus usages.jpg, ambox important.svg, commons-logo.svg, nuvola apps edu mathematics blue-p.svg, people icon.svg, romanabacusrecon.jpg, roman tablet employed in making arithmetical calculations (14781129921).jpg, symbol category class.svg
## 4 cemacolli one third tlalcuahuitl aztec glyph.png, cemmatl thee fifths of a tlalcuahuitl.png, cemmitl one half of tlalcuahuitl aztec glyph.png, cemomitl one fifth of tlalcuahutil aztec glyph.png, cenyollotli two fifths of a tlalcuahuitl aztec glyph.png
## 5 asia (orthographic projection).svg, clay tablet, mathematical, geometric-algebraic, similar to the euclidean geometry. from tell harmal, iraq. 2003-1595 bce. iraq museum.jpg', "file:clay tablet, mathematical, geometric-algebraic, similar to the pythagorean theorem. from tell al-dhabba'i, iraq. 2003-1595 bce. iraq museum.jpg", 'file:nuvola apps edu mathematics blue-p.svg, symbol category class.svg, ybc-7289-obv-labeled.jpg
## num_images log_num_images mo_page_views log_mo_page_views clicks_in
## 0 10 2.3978953 160941 11.988799 117924
## 1 1 0.6931472 3135 8.050703 3967
## 2 10 2.3978953 44004 10.692059 42122
## 3 8 2.1972246 2568 7.851272 1608
## 4 5 1.7917595 146 4.990433 51
## 5 6 1.9459101 8715 9.072916 5874
## log_clicks_in clicks_out log_clicks_out clicks_per_view log_clicks_per_view
## 0 11.677804 54025 10.897221 0.33568202 0.28944204
## 1 8.286017 540 6.293419 0.17224880 0.15892396
## 2 10.648349 4532 8.419139 0.10299064 0.09802525
## 3 7.383368 155 5.049856 0.06035826 0.05860683
## 4 3.951244 0 0.000000 0.00000000 0.00000000
## 5 8.678461 1268 7.145984 0.14549627 0.13583797
## is_mo_page_views_zero is_clicks_in_zero is_clicks_out_zero
## 0 False False False
## 1 False False False
## 2 False False False
## 3 False False False
## 4 False False True
## 5 False False False
## is_clicks_per_view_zero type
## 0 False mathematics
## 1 False mathematics
## 2 False mathematics
## 3 False mathematics
## 4 True mathematics
## 5 False mathematics
## categories
## 0 ['all wikipedia articles written in american english', 'all articles with dead external links', 'all articles with failed verification', 'articles containing ancient greek (to 1453)-language text', 'articles containing greek-language text', 'articles containing latin-language text', 'articles with dead external links from october 2025', 'articles with failed verification from october 2024', 'articles with short description', 'cs1 german-language sources (de)', 'cs1 errors: isbn date', 'cs1 errors: periodical ignored', 'formal sciences', 'main topic articles', 'mathematics', 'pages using multiple image with manual scaled images', 'pages using sidebar with the child parameter', 'short description is different from wikidata', 'use american english from august 2022', 'use mdy dates from october 2024', 'webarchive template archiveis links', 'webarchive template wayback links', 'wikipedia indefinitely move-protected pages', 'wikipedia indefinitely semi-protected pages']
## 1 ['all articles needing additional references', 'articles needing additional references from june 2022', 'articles with short description', 'language', 'mathematics', 'short description is different from wikidata']
## 2 ['abacus', 'all wikipedia articles written in american english', 'all articles with unsourced statements', 'ancient roman mathematics', 'articles containing ancient greek (to 1453)-language text', 'articles containing chinese-language text', 'articles containing japanese-language text', 'articles containing latin-language text', 'articles containing russian-language text', 'articles with short description', 'articles with unsourced statements from april 2024', 'cs1: long volume value', 'cs1 chinese-language sources (zh)', 'cs1 greek-language sources (el)', 'cs1 korean-language sources (ko)', 'cs1 latin-language sources (la)', 'cs1 spanish-language sources (es)', 'cs1 errors: isbn date', 'cs1 uses korean-language script (ko)', 'chinese mathematics', 'commons link from wikidata', 'egyptian mathematics', 'greek mathematics', 'indian mathematics', 'japanese mathematics', 'korean mathematics', 'mathematical tools', 'pages with nahuatl languages ipa', 'short description matches wikidata', 'use american english from may 2021', 'use mdy dates from june 2013', 'wikipedia articles incorporating a citation from eb9', 'wikipedia articles incorporating a citation from the 1911 encyclopaedia britannica with wikisource reference']
## 3 ['abacus', 'all articles that may contain original research', 'all articles with unsourced statements', 'ancient roman mathematics', 'ancient roman technology', 'articles that may contain original research from march 2024', 'articles with short description', 'articles with unsourced statements from march 2022', 'cs1: unfit url', 'cs1 german-language sources (de)', 'short description matches wikidata']
## 4 ['articles with short description', 'aztec mathematics', 'pages with nahuatl languages ipa', 'short description matches wikidata', 'units of length']
## 5 ['articles with short description', 'babylonian mathematics', 'cs1 maint: multiple names: authors list', 'mathematics of ancient history', 'short description is different from wikidata', 'use dmy dates from february 2024', 'webarchive template wayback links']
## categories_string
## 0 all wikipedia articles written in american english, all articles with dead external links, all articles with failed verification, articles containing ancient greek (to 1453)-language text, articles containing greek-language text, articles containing latin-language text, articles with dead external links from october 2025, articles with failed verification from october 2024, articles with short description, cs1 german-language sources (de), cs1 errors: isbn date, cs1 errors: periodical ignored, formal sciences, main topic articles, mathematics, pages using multiple image with manual scaled images, pages using sidebar with the child parameter, short description is different from wikidata, use american english from august 2022, use mdy dates from october 2024, webarchive template archiveis links, webarchive template wayback links, wikipedia indefinitely move-protected pages, wikipedia indefinitely semi-protected pages
## 1 all articles needing additional references, articles needing additional references from june 2022, articles with short description, language, mathematics, short description is different from wikidata
## 2 abacus, all wikipedia articles written in american english, all articles with unsourced statements, ancient roman mathematics, articles containing ancient greek (to 1453)-language text, articles containing chinese-language text, articles containing japanese-language text, articles containing latin-language text, articles containing russian-language text, articles with short description, articles with unsourced statements from april 2024, cs1: long volume value, cs1 chinese-language sources (zh), cs1 greek-language sources (el), cs1 korean-language sources (ko), cs1 latin-language sources (la), cs1 spanish-language sources (es), cs1 errors: isbn date, cs1 uses korean-language script (ko), chinese mathematics, commons link from wikidata, egyptian mathematics, greek mathematics, indian mathematics, japanese mathematics, korean mathematics, mathematical tools, pages with nahuatl languages ipa, short description matches wikidata, use american english from may 2021, use mdy dates from june 2013, wikipedia articles incorporating a citation from eb9, wikipedia articles incorporating a citation from the 1911 encyclopaedia britannica with wikisource reference
## 3 abacus, all articles that may contain original research, all articles with unsourced statements, ancient roman mathematics, ancient roman technology, articles that may contain original research from march 2024, articles with short description, articles with unsourced statements from march 2022, cs1: unfit url, cs1 german-language sources (de), short description matches wikidata
## 4 articles with short description, aztec mathematics, pages with nahuatl languages ipa, short description matches wikidata, units of length
## 5 articles with short description, babylonian mathematics, cs1 maint: multiple names: authors list, mathematics of ancient history, short description is different from wikidata, use dmy dates from february 2024, webarchive template wayback links
## num_categories log_num_categories num_links_internal log_num_links_internal
## 0 24 3.218876 500 6.216606
## 1 6 1.945910 83 4.430817
## 2 33 3.526361 235 5.463832
## 3 11 2.484907 92 4.532599
## 4 5 1.791759 11 2.484907
## 5 7 2.079442 261 5.568345
## num_editors log_num_editors num_edits log_num_edits creation_date
## 0 284 5.652489 500 6.216606 2001-11-08t15:31:38z
## 1 170 5.141664 308 5.733341 2003-11-20t14:31:23z
## 2 279 5.634790 500 6.216606 2001-11-02t14:55:45z
## 3 158 5.068904 339 5.828946 2004-08-31t15:44:24z
## 4 9 2.302585 52 3.970292 2012-09-08t08:06:32z
## 5 276 5.624018 500 6.216606 2006-01-27t06:23:38z
## creation_date_timestamp log_creation_date_timestamp links_per_word
## 0 1005233498 20.72849 0
## 1 1069338683 20.79031 0
## 2 1004712945 20.72797 0
## 3 1093967064 20.81308 0
## 4 1347091592 21.02121 0
## 5 1138343018 20.85284 0
## log_links_per_word cat_mathematics_related_lists cat_archaeological_artifacts
## 0 0 0 0
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## cat_number_theory cat_quantum_mechanics cat_dynamical_systems
## 0 0 0 0
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## cat_20th_century_american_mathematicians cat_measurement cat_astrophysics
## 0 0 0 0
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## cat_21st_century_american_mathematicians cat_pseudohistory
## 0 0 0
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## cat_creators_of_writing_systems cat_historiography cat_historical_eras
## 0 0 0 0
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## cat_thermodynamics cat_nuclear_physics cat_death_conspiracy_theories
## 0 0 0 0
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## cat_topology cat_cryptography cat_philosophers_of_history
## 0 0 0 0
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## cat_mathematical_logic cat_condensed_matter_physics
## 0 0 0
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## cat_fellows_of_the_american_physical_society cat_physical_quantities
## 0 0 0
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## cat_living_people cat_mechanics cat_destroyed_populated_places
## 0 0 0 0
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## cat_pseudohistorians cat_fellows_of_the_american_mathematical_society
## 0 0 0
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## [1] 28627
## Warning: pakiet 'dplyr' został zbudowany w wersji R 4.4.3
##
## Dołączanie pakietu: 'dplyr'
## Następujące obiekty zostały zakryte z 'package:stats':
##
## filter, lag
## Następujące obiekty zostały zakryte z 'package:base':
##
## intersect, setdiff, setequal, union
## Joining with `by = join_by(title, summary, word_count, log_word_count,
## image_titles, image_titles_string, num_images, log_num_images, mo_page_views,
## log_mo_page_views, clicks_in, log_clicks_in, clicks_out, log_clicks_out,
## clicks_per_view, log_clicks_per_view, is_mo_page_views_zero, is_clicks_in_zero,
## is_clicks_out_zero, is_clicks_per_view_zero, type, categories,
## categories_string, num_categories, log_num_categories, num_links_internal,
## log_num_links_internal, num_editors, log_num_editors, num_edits, log_num_edits,
## creation_date, creation_date_timestamp, log_creation_date_timestamp,
## links_per_word, log_links_per_word, cat_mathematics_related_lists,
## cat_archaeological_artifacts, cat_number_theory, cat_quantum_mechanics,
## cat_dynamical_systems, cat_20th_century_american_mathematicians,
## cat_measurement, cat_astrophysics, cat_21st_century_american_mathematicians,
## cat_pseudohistory, cat_creators_of_writing_systems, cat_historiography,
## cat_historical_eras, cat_thermodynamics, cat_nuclear_physics,
## cat_death_conspiracy_theories, cat_topology, cat_cryptography,
## cat_philosophers_of_history, cat_mathematical_logic,
## cat_condensed_matter_physics, cat_fellows_of_the_american_physical_society,
## cat_physical_quantities, cat_living_people, cat_mechanics,
## cat_destroyed_populated_places, cat_pseudohistorians,
## cat_fellows_of_the_american_mathematical_society)`
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 955798 51.1 1901632 101.6 1379847 73.7
## Vcells 44161847 337.0 96768958 738.3 94009182 717.3
## Warning: pakiet 'gt' został zbudowany w wersji R 4.4.3
## Warning: pakiet 'purrr' został zbudowany w wersji R 4.4.2
library(tibble)
train %>% select(where(is.numeric))%>%
map_df(~ as.list(summary(.x))) %>%
mutate(variable = names(select(train, where(is.numeric)))) %>%
relocate(variable) %>% gt()| variable | Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
|---|---|---|---|---|---|---|
| word_count | 1.000000e+00 | 2.022500e+02 | 4.830000e+02 | 1.244131e+03 | 1.351000e+03 | 4.167400e+04 |
| log_word_count | 6.931472e-01 | 5.314434e+00 | 6.182085e+00 | 6.278052e+00 | 7.209340e+00 | 1.063766e+01 |
| num_images | 0.000000e+00 | 1.000000e+00 | 2.000000e+00 | 3.300585e+00 | 5.000000e+00 | 1.000000e+01 |
| log_num_images | 0.000000e+00 | 6.931472e-01 | 1.098612e+00 | 1.223615e+00 | 1.791759e+00 | 2.397895e+00 |
| mo_page_views | 0.000000e+00 | 1.840000e+02 | 4.540000e+02 | 3.337458e+03 | 1.630500e+03 | 2.394312e+06 |
| log_mo_page_views | 0.000000e+00 | 5.220356e+00 | 6.120297e+00 | 6.415918e+00 | 7.397255e+00 | 1.468861e+01 |
| clicks_in | 0.000000e+00 | 4.300000e+01 | 1.880000e+02 | 2.593061e+03 | 9.867500e+02 | 7.274890e+05 |
| log_clicks_in | 0.000000e+00 | 3.784190e+00 | 5.241747e+00 | 5.199044e+00 | 6.895430e+00 | 1.349736e+01 |
| clicks_out | 0.000000e+00 | 0.000000e+00 | 1.300000e+01 | 7.503346e+02 | 1.800000e+02 | 3.190050e+05 |
| log_clicks_out | 0.000000e+00 | 0.000000e+00 | 2.639057e+00 | 2.816838e+00 | 5.198497e+00 | 1.267297e+01 |
| clicks_per_view | 0.000000e+00 | 0.000000e+00 | 2.727443e-02 | 8.082203e-02 | 1.192622e-01 | 9.974582e-01 |
| log_clicks_per_view | 0.000000e+00 | 0.000000e+00 | 2.690911e-02 | 7.193058e-02 | 1.126698e-01 | 6.918755e-01 |
| num_categories | 1.000000e+00 | 6.000000e+00 | 9.000000e+00 | 1.132224e+01 | 1.400000e+01 | 2.200000e+02 |
| log_num_categories | 6.931472e-01 | 1.945910e+00 | 2.302585e+00 | 2.338079e+00 | 2.708050e+00 | 5.398163e+00 |
| num_links_internal | 1.000000e+00 | 2.100000e+01 | 4.400000e+01 | 1.131882e+02 | 1.510000e+02 | 5.000000e+02 |
| log_num_links_internal | 6.931472e-01 | 3.091042e+00 | 3.806662e+00 | 4.061465e+00 | 5.023881e+00 | 6.216606e+00 |
| num_editors | 1.000000e+00 | 1.800000e+01 | 3.400000e+01 | 6.916571e+01 | 8.300000e+01 | 3.730000e+02 |
| log_num_editors | 6.931472e-01 | 2.944439e+00 | 3.555348e+00 | 3.705627e+00 | 4.430817e+00 | 5.924256e+00 |
| num_edits | 1.000000e+00 | 3.100000e+01 | 6.300000e+01 | 1.379969e+02 | 1.750000e+02 | 5.000000e+02 |
| log_num_edits | 6.931472e-01 | 3.465736e+00 | 4.158883e+00 | 4.322748e+00 | 5.170484e+00 | 6.216606e+00 |
| creation_date_timestamp | 9.797972e+08 | 1.124904e+09 | 1.212924e+09 | 1.271335e+09 | 1.403138e+09 | 1.727716e+09 |
| log_creation_date_timestamp | 2.070286e+01 | 2.084096e+01 | 2.091630e+01 | 2.095263e+01 | 2.106198e+01 | 2.127007e+01 |
| links_per_word | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.371059e-01 | 0.000000e+00 | 1.800000e+01 |
| log_links_per_word | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 5.853933e-02 | 0.000000e+00 | 2.944439e+00 |
| cat_mathematics_related_lists | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 7.379268e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_archaeological_artifacts | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 6.156668e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_number_theory | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 7.772247e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_quantum_mechanics | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.135272e-02 | 0.000000e+00 | 1.000000e+00 |
| cat_dynamical_systems | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 9.562484e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_20th_century_american_mathematicians | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.558728e-02 | 0.000000e+00 | 1.000000e+00 |
| cat_measurement | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 6.549646e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_astrophysics | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 7.073618e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_21st_century_american_mathematicians | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.737752e-02 | 0.000000e+00 | 1.000000e+00 |
| cat_pseudohistory | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 7.248275e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_creators_of_writing_systems | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 4.410095e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_historiography | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 4.977731e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_historical_eras | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 4.017116e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_thermodynamics | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 8.427212e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_nuclear_physics | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 8.558205e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_death_conspiracy_theories | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 5.501703e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_topology | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 8.863855e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_cryptography | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 7.553925e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_philosophers_of_history | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 4.628417e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_mathematical_logic | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 7.117282e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_condensed_matter_physics | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.047943e-02 | 0.000000e+00 | 1.000000e+00 |
| cat_fellows_of_the_american_physical_society | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 5.763689e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_physical_quantities | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 8.296219e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_living_people | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 7.916339e-02 | 0.000000e+00 | 1.000000e+00 |
| cat_mechanics | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 5.938346e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_destroyed_populated_places | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 5.545367e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_pseudohistorians | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 5.589032e-03 | 0.000000e+00 | 1.000000e+00 |
| cat_fellows_of_the_american_mathematical_society | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 4.266003e-02 | 0.000000e+00 | 1.000000e+00 |
## Warning: pakiet 'ggplot2' został zbudowany w wersji R 4.4.3
library(stringr)
num_cols <- train %>%
select(where(is.numeric)) %>%
names()
cols_with_log <- num_cols[
paste0("log_", num_cols) %in% names(train)
]
cols_with_log <- setdiff(cols_with_log, "log_clicks_per_view")
plot_for_col <- function(col) {
log_col <- paste0("log_", col)
p1 <- ggplot(train, aes_string(x = col)) +
geom_histogram(bins = 40, fill = "steelblue", alpha = 0.7) +
ggtitle(paste("Histogram:", col))
p2 <- ggplot(train, aes_string(x = log_col)) +
geom_histogram(bins = 40, fill = "darkorange", alpha = 0.7) +
ggtitle(paste("Histogram:", log_col))
p3 <- ggplot(train, aes_string(x = col, y = "log_clicks_per_view")) +
geom_point(alpha = 0.4) +
ggtitle(paste("log_clicks_per_view vs", col))
p4 <- ggplot(train, aes_string(x = log_col, y = "log_clicks_per_view")) +
geom_point(alpha = 0.4) +
ggtitle(paste("log_clicks_per_view vs", log_col))
list(p1 = p1, p2 = p2, p3 = p3, p4 = p4)
}
all_plots <- map(cols_with_log, plot_for_col)## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: pakiet 'patchwork' został zbudowany w wersji R 4.4.3
num_cols <- train %>%
select(where(is.numeric)) %>%
names()
cols_with_log <- num_cols[
paste0("log_", num_cols) %in% names(train)
]
cols_with_log <- setdiff(cols_with_log, "log_clicks_per_view")
panel_for_col <- function(col) {
log_col <- paste0("log_", col)
cor_raw <- cor(train[[col]], train$log_clicks_per_view, use = "complete.obs")
cor_log <- cor(train[[log_col]], train$log_clicks_per_view, use = "complete.obs")
p1 <- ggplot(train, aes_string(x = col)) +
geom_histogram(bins = 40, fill = "steelblue", alpha = 0.7) +
ggtitle(paste("Histogram:", col))
p2 <- ggplot(train, aes_string(x = log_col)) +
geom_histogram(bins = 40, fill = "darkorange", alpha = 0.7) +
ggtitle(paste("Histogram:", log_col))
p3 <- ggplot(train, aes_string(x = col, y = "log_clicks_per_view")) +
geom_point(color = "steelblue", alpha = 0.1) +
annotate("text", x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("corr = ", round(cor_raw, 3)),
size = 4, color = "black") +
ggtitle(paste("log_clicks_per_view vs", col))
p4 <- ggplot(train, aes_string(x = log_col, y = "log_clicks_per_view")) +
geom_point(color = "darkorange", alpha = 0.1) +
annotate("text", x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("corr = ", round(cor_log, 3)),
size = 4, color = "black") +
ggtitle(paste("log_clicks_per_view vs", log_col))
(p1 | p2) /
(p3 | p4)
}
walk(cols_with_log, ~ print(panel_for_col(.x)))cols <- c("log_clicks_per_view", "log_word_count", "num_images", "num_categories", "log_num_links_internal", "log_num_editors", "num_edits", "creation_date_timestamp", "log_links_per_word")
log_cols <- num_cols[str_starts(num_cols, "log_")]
cols_with_log_version <- num_cols[ paste0("log_", num_cols) %in% names(train) ]
cols_without_logs <- setdiff( num_cols, union(log_cols, cols_with_log_version) )
features <- union(cols, cols_without_logs)
features <- setdiff(features, c("clicks_in" ,"clicks_per_view","mo_page_views","clicks_out"))
features## [1] "log_clicks_per_view"
## [2] "log_word_count"
## [3] "num_images"
## [4] "num_categories"
## [5] "log_num_links_internal"
## [6] "log_num_editors"
## [7] "num_edits"
## [8] "creation_date_timestamp"
## [9] "log_links_per_word"
## [10] "cat_mathematics_related_lists"
## [11] "cat_archaeological_artifacts"
## [12] "cat_number_theory"
## [13] "cat_quantum_mechanics"
## [14] "cat_dynamical_systems"
## [15] "cat_20th_century_american_mathematicians"
## [16] "cat_measurement"
## [17] "cat_astrophysics"
## [18] "cat_21st_century_american_mathematicians"
## [19] "cat_pseudohistory"
## [20] "cat_creators_of_writing_systems"
## [21] "cat_historiography"
## [22] "cat_historical_eras"
## [23] "cat_thermodynamics"
## [24] "cat_nuclear_physics"
## [25] "cat_death_conspiracy_theories"
## [26] "cat_topology"
## [27] "cat_cryptography"
## [28] "cat_philosophers_of_history"
## [29] "cat_mathematical_logic"
## [30] "cat_condensed_matter_physics"
## [31] "cat_fellows_of_the_american_physical_society"
## [32] "cat_physical_quantities"
## [33] "cat_living_people"
## [34] "cat_mechanics"
## [35] "cat_destroyed_populated_places"
## [36] "cat_pseudohistorians"
## [37] "cat_fellows_of_the_american_mathematical_society"
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(features)
##
## # Now:
## data %>% select(all_of(features))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
test <- select(test, features)
cols_to_scale <- setdiff(names(train), "log_clicks_per_view")
m <- sapply(train[cols_to_scale], mean)
s <- sapply(train[cols_to_scale], sd)
train[cols_to_scale] <- sweep(train[cols_to_scale], 2, m, "-")
train[cols_to_scale] <- sweep(train[cols_to_scale], 2, s, "/")
test[cols_to_scale] <- sweep(test[cols_to_scale], 2, m, "-")
test[cols_to_scale] <- sweep(test[cols_to_scale], 2, s, "/")##
## FALSE TRUE
## 10439 12463
##
## FALSE TRUE
## 2591 3134
Klasy są zbalansowane i podobnie rozłożone w zbiorze treningowym i testowym:)
Cross-validation liczby sąsiadów
## Warning: pakiet 'FNN' został zbudowany w wersji R 4.4.3
y <- ifelse(train$log_clicks_per_view > 0, 1, 0)
X <- train %>% select(-log_clicks_per_view)
set.seed(123)
K_folds <- 5
folds <- sample(rep(1:K_folds, length.out = nrow(X)))
k_values <- seq(1, 19, by = 2)
cv_results <- numeric(length(k_values))
for (i in seq_along(k_values)) {
k <- k_values[i]
acc_vec <- numeric(K_folds)
for (fold in 1:K_folds) {
train_idx <- which(folds != fold)
test_idx <- which(folds == fold)
X_tr <- X[train_idx, ]
X_te <- X[test_idx, ]
y_tr <- y[train_idx]
y_te <- y[test_idx]
pred <- knn(
train = X_tr,
test = X_te,
cl = y_tr,
k = k
)
acc_vec[fold] <- mean(pred == y_te)
}
cv_results[i] <- mean(acc_vec)
}
cv_table <- data.frame(
k = k_values,
accuracy = cv_results
)
cv_table## k accuracy
## 1 1 0.7247402
## 2 3 0.7529036
## 3 5 0.7605452
## 4 7 0.7671821
## 5 9 0.7713303
## 6 11 0.7738630
## 7 13 0.7742119
## 8 15 0.7756091
## 9 17 0.7746922
## 10 19 0.7751289
## Warning: pakiet 'plotly' został zbudowany w wersji R 4.4.3
##
## Dołączanie pakietu: 'plotly'
## Następujący obiekt został zakryty z 'package:ggplot2':
##
## last_plot
## Następujący obiekt został zakryty z 'package:stats':
##
## filter
## Następujący obiekt został zakryty z 'package:graphics':
##
## layout
plot_ly(
data = cv_table,
x = ~k,
y = ~accuracy,
type = "scatter",
mode = "lines+markers"
) %>%
layout(
title = "Cross Validation for k - klasyfikacja KNN",
xaxis = list(title = "k"),
yaxis = list(title = "Accuracy")
)Test modelu dla wybranego k
y_train <- ifelse(train$log_clicks_per_view > 0, 1, 0)
y_test <- ifelse(test$log_clicks_per_view > 0, 1, 0)
X_train <- train %>% select(-log_clicks_per_view)
X_test <- test %>% select(-log_clicks_per_view)
set.seed(123)
k <- 11
pred <- knn(
train = X_train,
test = X_test,
cl = y_train,
k = k
)
tab <- table(Predicted = pred, Actual = y_test)
tab## Actual
## Predicted 0 1
## 0 2017 747
## 1 574 2387
## [1] 0.7692576
y <- train$log_clicks_per_view
X <- train %>% select(-log_clicks_per_view)
set.seed(123)
K_folds <- 5
folds <- sample(rep(1:K_folds, length.out = nrow(X)))
k_values <- seq(5, 25, by = 2)
cv_rmse <- numeric(length(k_values))
for (i in seq_along(k_values)) {
k <- k_values[i]
rmse_vec <- numeric(K_folds)
for (fold in 1:K_folds) {
train_idx <- which(folds != fold)
test_idx <- which(folds == fold)
X_tr <- X[train_idx, ]
X_te <- X[test_idx, ]
y_tr <- y[train_idx]
y_te <- y[test_idx]
pred <- knn.reg(
train = X_tr,
test = X_te,
y = y_tr,
k = k
)$pred
rmse_vec[fold] <- sqrt(mean((pred - y_te)^2))
}
cv_rmse[i] <- mean(rmse_vec)
}
cv_reg_table <- data.frame(
k = k_values,
RMSE = cv_rmse
)
cv_reg_table## k RMSE
## 1 5 0.08998771
## 2 7 0.08819632
## 3 9 0.08740525
## 4 11 0.08682771
## 5 13 0.08642395
## 6 15 0.08611523
## 7 17 0.08585791
## 8 19 0.08581224
## 9 21 0.08576875
## 10 23 0.08575809
## 11 25 0.08574710
plot_ly(
data = cv_reg_table,
x = ~k,
y = ~RMSE,
type = "scatter",
mode = "lines+markers"
) %>%
layout(
title = "Cross Validation for k - regresja KNN",
xaxis = list(title = "k"),
yaxis = list(title = "RMSE")
)y_train <- train$log_clicks_per_view
y_test <- test$log_clicks_per_view
X_train <- train %>% select(-log_clicks_per_view)
X_test <- test %>% select(-log_clicks_per_view)
k <- 25
pred <- knn.reg(
train = X_train,
test = X_test,
y = y_train,
k = k
)$pred
rmse <- sqrt(mean((pred - y_test)^2))
mae <- mean(abs(pred - y_test))
ss_res <- sum((y_test - pred)^2)
ss_tot <- sum((y_test - mean(y_test))^2)
r2 <- 1 - ss_res/ss_tot
rmse## [1] 0.08629388
## [1] 0.05619395
## [1] 0.3141905
pred_train <- knn.reg(
train = X_train,
test = X_train,
y = y_train,
k = k
)$pred
pred_test <- pred
df_plot <- rbind(
data.frame(
set = "train",
actual = y_train,
pred = pred_train
),
data.frame(
set = "test",
actual = y_test,
pred = pred_test
)
)
ggplot(df_plot, aes(x = actual, y = pred, color = set, shape = set)) +
geom_point(alpha = 0.2, size = 2) +
scale_color_manual(values = c("train" = "blue", "test" = "green4")) +
scale_shape_manual(values = c("train" = 15, "test" = 17)) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
labs(
title = "Predicted vs Actual (train & test)",
x = "Actual",
y = "Predicted"
) +
theme_minimal()clicks per view > 0train_pos <- train %>% filter(log_clicks_per_view > 0)
test_pos <- test %>% filter(log_clicks_per_view > 0)
y_train <- train_pos$log_clicks_per_view
y_test <- test_pos$log_clicks_per_view
X_train <- train_pos %>% select(-log_clicks_per_view)
X_test <- test_pos %>% select(-log_clicks_per_view)
k <- 25
pred <- knn.reg(
train = X_train,
test = X_test,
y = y_train,
k = k
)$pred
rmse <- sqrt(mean((pred - y_test)^2))
mae <- mean(abs(pred - y_test))
ss_res <- sum((y_test - pred)^2)
ss_tot <- sum((y_test - mean(y_test))^2)
r2 <- 1 - ss_res/ss_tot
rmse## [1] 0.1003523
## [1] 0.07223938
## [1] 0.1579923
klasyfikacja, czy click per view jest dodatni
regresja dla wierszy sklasyfikowanych jako dodatnie
y_train <- train$log_clicks_per_view
y_test <- test$log_clicks_per_view
X_train <- train %>% select(-log_clicks_per_view)
X_test <- test %>% select(-log_clicks_per_view)
y_train_bin <- ifelse(y_train > 0, 1, 0)
y_test_bin <- ifelse(y_test > 0, 1, 0)
k_class <- 11
k_reg <- 25
pred_class <- knn(
train = X_train,
test = X_test,
cl = y_train_bin,
k = k_class
)
pred_reg <- rep(0, length(pred_class))
idx_pos <- which(pred_class == 1)
if (length(idx_pos) > 0) {
pred_reg[idx_pos] <- knn.reg(
train = X_train,
test = X_test[idx_pos, ],
y = y_train,
k = k_reg
)$pred
}
final_pred <- pred_reg
rmse <- sqrt(mean((final_pred - y_test)^2))
mae <- mean(abs(final_pred - y_test))
ss_res <- sum((y_test - final_pred)^2)
ss_tot <- sum((y_test - mean(y_test))^2)
r2 <- 1 - ss_res/ss_tot
rmse## [1] 0.0888856
## [1] 0.05124505
## [1] 0.2723771
Wizualizacja
pred_class_train <- knn(
train = X_train,
test = X_train,
cl = ifelse(y_train > 0, 1, 0),
k = k_class
)
final_pred_train <- rep(0, length(pred_class_train))
idx_pos_train <- which(pred_class_train == 1)
if (length(idx_pos_train) > 0) {
final_pred_train[idx_pos_train] <- knn.reg(
train = X_train,
test = X_train[idx_pos_train, ],
y = y_train,
k = k_reg
)$pred
}
pred_class_test <- knn(
train = X_train,
test = X_test,
cl = ifelse(y_train > 0, 1, 0),
k = k_class
)
final_pred_test <- rep(0, length(pred_class_test))
idx_pos_test <- which(pred_class_test == 1)
if (length(idx_pos_test) > 0) {
final_pred_test[idx_pos_test] <- knn.reg(
train = X_train,
test = X_test[idx_pos_test, ],
y = y_train,
k = k_reg
)$pred
}df_plot <- rbind(
data.frame(
set = "train",
actual = y_train,
pred = final_pred_train
),
data.frame(
set = "test",
actual = y_test,
pred = final_pred_test
)
)
ggplot(df_plot, aes(x = actual, y = pred, color = set, shape = set)) +
geom_point(alpha = 0.2, size = 2) +
scale_color_manual(values = c("train" = "blue", "test" = "green4")) +
scale_shape_manual(values = c("train" = 15, "test" = 17)) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
labs(
title = "Predicted vs Actual (train & test) — 2-step kNN model",
x = "Actual",
y = "Predicted"
) +
theme_minimal()Wniosek: proste podejście dawało bardzo podobne - jeśli nie lepsze - rezultaty.